This notebook is to explore Canadian SARS-CoV-2 genomic and epidemiological data with the aim to investigate viral evolution and spread. It is for discussion with pillar 6’s team and for sharing with collaborators, e.g. PH labs. These analyses can spur further research within or across pillars, be used for reports (or data dashboards), discussions with the science communication pillar for public dissemination, and the code can be used or repackaged for public health authorities/laboratories for their internal use.
Canadian genomic and epidemiological data will be regularly pulled from various public sources (see list below) to keep these analyses up-to-date. Only representations of aggregate data will be posted here. [aim: use only VirusSeq Portal data]
Caveat: Because of the evolving screening and sequencing strategies over time, publicly available genomes do not necessarily represent the actual proportions of circulating lineages. Thus, we would like to highlight that the data and analyses presented here may be impacted by potential biases resulting from the variability of jurisdictional sampling, screening, sequencing methods and strategies.
## 1. LOAD processed metadata of Canadian sequences (with latest pangolin, division, and full seq IDs)
#Download metadata from gisaid, put the date here:
# GISAID metadata download date; used to build the file name inside the zip
gisaiddate <- "2022_03_04"
# tab-separated metadata table for all Canadian sequences
metaCANall <- read.csv(
  unz("./data_needed/metadata_CANall_last.zip",
      paste("metadata_CANall_", gisaiddate, ".csv", sep = "")),
  sep = "\t", row.names = NULL
)
metaCANall$Collection_date <- as.Date(metaCANall$Collection_date)
# third field of the "_/_"-separated Location string is the province/division
metaCANall$province <- sapply(strsplit(metaCANall$Location, "_/_"), `[`, 3)
# harmonise the two Newfoundland-and-Labrador spellings into one label
metaCANall$province[metaCANall$province %in% c("Newfoundland", "Labrador")] <-
  "Newfoundland_and_Labrador"
# Lookup table of variant groups: display name, regex pattern matched against
# Pango lineages, and a fixed plotting colour.  Built in a single data.frame()
# call instead of growing the frame one row at a time (and `color` is declared
# as character — the original declared it numeric() and relied on coercion).
# Row order matters downstream: later rows override earlier matches, so
# sublineage groups (e.g. "Delta AY.25") must come after their parent.
VOCVOI <- data.frame(
  name = c("Alpha", "Beta", "Gamma", "Delta", "Delta AY.25", "Delta AY.27",
           "Lambda", "Omicron BA.1", "Omicron BA.1.1", "Omicron BA.2",
           "Mu", "A.23.1", "B.1.438.1"),
  pattern = c("B.1.1.7|Q.", "B.1.351", "P.", "B.1.617|AY.", "AY.25", "AY.27",
              "C.37|C.37.1", "B.1.1.529|BA.1", "BA.1.1", "BA.2",
              "B.1.621", "A.23.1", "B.1.438.1"),
  color = c("#B29C71", "#F08C3A", "#444444", "#A6CEE3", "#61A6A0", "#438FC0",
            "#CD950C", "#8B0000", "#FA8072", "#FF0000",
            "#BB4513", "#9AD378", "#3EA534"),
  stringsAsFactors = FALSE
)
# Assign each sequence to a variant group by regex-matching its Pango lineage
# against the VOCVOI patterns; later VOCVOI rows override earlier ones, so
# sublineage groups (e.g. "Delta AY.25") win over their parent group.
# Also builds `pal`, the named colour palette used by all plots.
metaCANall$pango.group <- "other"
pal <- character(0)   # named colour vector, filled below (original: rainbow(0))
pal["other"] <- "grey"
for (row in seq_len(nrow(VOCVOI))) {
  name <- VOCVOI[row, "name"]
  # Escape literal dots so e.g. "B.1.1.7" does not also match "B91.127".
  # BUGFIX: the original gsub("\\.", ".", ...) replaced a dot with a dot —
  # a no-op that left the dots as regex wildcards.
  pattern <- gsub("\\.", "\\\\.", VOCVOI[row, "pattern"])
  metaCANall$pango.group[grepl(pattern, metaCANall$Pango_lineage)] <- name
  pal[name] <- VOCVOI[row, "color"]
}
## 2. LOAD epidemiological data (PHAC)
# Source: https://health-infobase.canada.ca/covid-19/epidemiological-summary-covid-19-cases.html?stat=num&measure=total&map=pt#a2
epidataCANall <- read.csv(url("https://health-infobase.canada.ca/src/data/covidLive/covid19-download.csv"))
epidataCANall$date <- as.Date(epidataCANall$date)
# use underscores in province names so they match metaCANall$province
epidataCANall$prname <- gsub(' ', '_', epidataCANall$prname)
# the last row of the feed carries the most recent (download) date
epidate <- epidataCANall$date[nrow(epidataCANall)]
Important caveat:
These plots show the changing composition of sequences posted to GISAID and the VirusSeq Portal by PANGO lineage/variant designation. Because sampling and sequencing procedures vary by region and time, this does not necessarily reflect the true composition of SARS-CoV-2 viruses in Canada over time. Only some infections are detected by PCR testing, and only some of those are sent for whole-genome sequencing, and not all of those are posted to public facing repositories. Sequencing volumes and priority have changed during the pandemic, and the sequencing strategy is typically a combination of prioritizing outbreaks, travellers, public health investigations, and random sampling for genomic surveillance. Thus, interpretation of these plots and comparisons between health regions should be done with caution.
# --- Histogram plot: counts per variant of all canadian data -------------
# Plot window and overlay colour.
# NOTE(review): the original first derived maxdate from gisaiddate and then
# immediately overwrote it with a hard-coded date; the dead assignment is
# dropped and the hard-coded cutoff kept (same final value).
mindate <- as.Date("2020-11-01")
maxdate <- as.Date("2022-02-28")
colorcases <- "#138808"  # green used for the reported-case line overlay
# Weekly stacked-bar plot of sequence counts per variant group, with a scaled
# line of reported cases (7-day average) on a secondary axis.
#   meta_tab: sequence metadata with Collection_date and pango.group columns
#   cases:    PHAC case counts with date and numtoday columns
# Uses globals: mindate, maxdate, pal, colorcases.  Returns a ggplot object.
plot_metadatadf <- function(meta_tab,cases) {
# Count sequences per week x variant group inside the plot window.
# (The second group_by() replaces the first; count(pango.group) then
# regroups by week + pango.group, which is the intended result.)
meta_tab = meta_tab %>% filter(!is.na(Collection_date),Collection_date > mindate,Collection_date < maxdate) %>% group_by(pango.group) %>% group_by(week = cut(Collection_date, "week")) %>% count(pango.group)
cases <- filter(cases, cases$date > mindate)
cases <- filter(cases, cases$date < maxdate)
cases <- cases %>% group_by(week = cut(date, "week")) #adds week column
# weekly case totals, converted to mean daily cases
cases <- data.frame(aggregate(cases$numtoday, by=list(Category=cases$week), FUN=sum))
cases$x=cases$x/7
# scale factor so the case line fits on the sequence-count axis
ratio <- max(data.frame(aggregate(meta_tab$n, by=list(Category=meta_tab$week), FUN=sum))$x)/max(cases$x)
cases$x=cases$x*ratio
ggplot(meta_tab, aes(x=as.Date(week)))+
geom_bar(stat = "identity", aes(y= n, fill=pango.group))+
scale_fill_manual(breaks=unique(sort(meta_tab$pango.group)), values = pal)+
ylab("")+xlab("")+
theme_classic()+theme(axis.text.x = element_text(angle=90, vjust=0.1,hjust=0.1))+
scale_x_date(date_breaks = "28 day")+
geom_line(data=cases, aes(x=as.Date(Category),y=x),size=2,color=colorcases,alpha=0.6)+
# secondary axis undoes the scaling so it reads in true recorded-case counts
scale_y_continuous(name = "sequenced cases per day",sec.axis = sec_axis(~./ratio, name="recorded cases per day",labels=label_number_auto()))+
theme(axis.title.y.right = element_text(color = colorcases),axis.text.y.right = element_text(color = colorcases))
}
# Weekly stacked-bar plot of variant-group PROPORTIONS (bars sum to 1 per
# week), with the reported-case line rescaled to [0,1] on a secondary axis.
# Same arguments and globals as plot_metadatadf().  Returns a ggplot object.
plot_metadatadf_freq <- function(meta_tab,cases) {
# count sequences per week x variant group inside the plot window
meta_tab = meta_tab %>% filter(!is.na(Collection_date),Collection_date > mindate,Collection_date < maxdate) %>% group_by(pango.group) %>% group_by(week = cut(Collection_date, "week")) %>% count(pango.group)
# prop.table(n) is evaluated within the week groups -> per-week proportions
meta_tab %>% mutate(freq = prop.table(n)) -> dfprop
cases <- filter(cases, cases$date > mindate)
cases <- filter(cases, cases$date < maxdate)
cases <- cases %>% group_by(week = cut(date, "week")) #adds week column
# weekly case totals, converted to mean daily cases
cases <- data.frame(aggregate(cases$numtoday, by=list(Category=cases$week), FUN=sum))
cases$x=cases$x/7
# rescale the case line so its maximum is 1 (matches the proportion axis)
ratio <- 1/max(cases$x)
cases$x=cases$x*ratio
ggplot(dfprop, aes(x=as.Date(week)))+
geom_bar(stat = "identity", aes(y= freq, fill=pango.group))+
scale_fill_manual(breaks=unique(sort(dfprop$pango.group)), values = pal)+
ylab("")+xlab("")+
theme_classic()+theme(axis.text.x = element_text(angle=90, vjust=0.1,hjust=0.1))+
scale_x_date(date_breaks = "28 day")+
geom_line(data=cases, aes(x=as.Date(Category),y=x),size=2,color=colorcases,alpha=0.6)+
# secondary axis undoes the scaling so it reads in true recorded-case counts
scale_y_continuous(name = "sequenced cases per day",sec.axis = sec_axis(~./ratio, name="recorded cases per day",labels=label_number_auto()))+
theme(axis.title.y.right = element_text(color = colorcases),axis.text.y.right = element_text(color = colorcases))
}
# National overview: weekly lineage counts (absolute) and proportions
plot_metadatadf(metaCANall,epidataCANall)
plot_metadatadf_freq(metaCANall,epidataCANall)
Sequence counts for all Canadian data in GISAID and VirusSeq Portal, up to 2022-03-06. Case counts over time (rolling 7 day average) are added in green for comparison, given that sequencing coverage varies over time.
From the beginning of the pandemic to the fall of 2021, Canadian sequences were mostly of the wildtype lineages (pre-VOCs). By the beginning of summer 2021, the VOCs Alpha and Gamma were the most sequenced lineages overall in Canada. The Delta wave grew late summer with Delta sublineages AY.25 and AY.27 constituting sizable proportions of this wave. Omicron arrived in November of 2021 and currently the 3 main sublineages in Canada are BA.1, BA.1.1, and BA.2. See below for jurisdictional differences of these plots.
There are two PANGO lineages that have a Canadian origin and that predominantly spread within Canada (with some exportations internationally): B.1.438.1 and B.1.1.176.
Other lineages of Canadian interest:
Here we show the same plots but for each province. Note that the NCCID provides a timeline of Canadian events related to each variant: https://nccid.ca/covid-19-variants/
Disclaimer:
All analyses below draw on the most recent publicly available viral sequence data and should be interpreted with caution due to lags in reporting and sequencing priorities that can differ across provinces or territories. For example, specific variants or populations might be preferentially sequenced at certain times in certain jurisdictions. When possible, these differences in sampling strategies are mentioned but they are not always known. These analyses are subject to change given new data.
With the arrival of the Omicron wave, many jurisdictions across Canada reached testing and sequencing capacity mid-late December 2021, and thus switched to targeted testing of priority groups (e.g. hospitalized patients, health care workers, and people in high-risk settings). Therefore, from this time onward, case counts are likely underestimated and the sequenced virus diversity is not necessarily representative of the virus circulating in the overall population.
Provincial sequencing strategy includes a subset of representative positive samples and prioritized cases (outbreaks, long-term care, travel-related, vaccine escape, hospitalized). More up-to-date covid data for this province can be found here:
http://www.bccdc.ca/health-info/diseases-conditions/covid-19/data-trends
# British Columbia: weekly lineage counts and proportions
prov <- 'British_Columbia'
plot_metadatadf(subset(metaCANall, province == prov),
                subset(epidataCANall, prname == prov))
plot_metadatadf_freq(subset(metaCANall, province == prov),
                     subset(epidataCANall, prname == prov))
More up-to-date covid data for this province can be found here:
https://www.alberta.ca/stats/covid-19-alberta-statistics.htm#variants-of-concern
# Alberta: weekly lineage counts and proportions
prov <- 'Alberta'
plot_metadatadf(subset(metaCANall, province == prov),
                subset(epidataCANall, prname == prov))
plot_metadatadf_freq(subset(metaCANall, province == prov),
                     subset(epidataCANall, prname == prov))
More up-to-date covid data for this province can be found here:
https://www.saskatchewan.ca/government/health-care-administration-and-provider-resources/treatment-procedures-and-guidelines/emerging-public-health-issues/2019-novel-coronavirus/cases-and-risk-of-covid-19-in-saskatchewan
# Saskatchewan: weekly lineage counts and proportions
prov <- 'Saskatchewan'
plot_metadatadf(subset(metaCANall, province == prov),
                subset(epidataCANall, prname == prov))
plot_metadatadf_freq(subset(metaCANall, province == prov),
                     subset(epidataCANall, prname == prov))
More up-to-date covid data for this province can be found here:
https://geoportal.gov.mb.ca/apps/manitoba-covid-19/explore
# Manitoba: weekly lineage counts and proportions
prov <- 'Manitoba'
plot_metadatadf(subset(metaCANall, province == prov),
                subset(epidataCANall, prname == prov))
plot_metadatadf_freq(subset(metaCANall, province == prov),
                     subset(epidataCANall, prname == prov))
More up-to-date covid data for this province can be found here:
https://www.publichealthontario.ca/en/diseases-and-conditions/infectious-diseases/respiratory-diseases/novel-coronavirus/variants
# Ontario: weekly lineage counts and proportions
prov <- 'Ontario'
plot_metadatadf(subset(metaCANall, province == prov),
                subset(epidataCANall, prname == prov))
plot_metadatadf_freq(subset(metaCANall, province == prov),
                     subset(epidataCANall, prname == prov))
Provincial random sequencing has been temporarily suspended as of Feb 8th, 2021. Quebec provides a list of updates on changes to screening and sequencing strategies, found here (in French): https://www.inspq.qc.ca/covid-19/donnees/variants#methodologie More up-to-date covid data for this province can be found here:
https://www.inspq.qc.ca/covid-19/donnees/variants
# Quebec: weekly lineage counts and proportions
prov <- 'Quebec'
plot_metadatadf(subset(metaCANall, province == prov),
                subset(epidataCANall, prname == prov))
plot_metadatadf_freq(subset(metaCANall, province == prov),
                     subset(epidataCANall, prname == prov))
More up-to-date covid data for this province can be found here:
https://experience.arcgis.com/experience/204d6ed723244dfbb763ca3f913c5cad
# Nova Scotia: weekly lineage counts and proportions
prov <- 'Nova_Scotia'
plot_metadatadf(subset(metaCANall, province == prov),
                subset(epidataCANall, prname == prov))
plot_metadatadf_freq(subset(metaCANall, province == prov),
                     subset(epidataCANall, prname == prov))
More up-to-date covid data for this province can be found here:
https://experience.arcgis.com/experience/8eeb9a2052d641c996dba5de8f25a8aa (NB dashboard)
# New Brunswick: weekly lineage counts and proportions
prov <- 'New_Brunswick'
plot_metadatadf(subset(metaCANall, province == prov),
                subset(epidataCANall, prname == prov))
plot_metadatadf_freq(subset(metaCANall, province == prov),
                     subset(epidataCANall, prname == prov))
More up-to-date covid data for this province can be found here:
https://covid-19-newfoundland-and-labrador-gnl.hub.arcgis.com/
# Newfoundland and Labrador: weekly lineage counts and proportions
prov <- 'Newfoundland_and_Labrador'
plot_metadatadf(subset(metaCANall, province == prov),
                subset(epidataCANall, prname == prov))
plot_metadatadf_freq(subset(metaCANall, province == prov),
                     subset(epidataCANall, prname == prov))
Here we present a subsampled phylogenetic snapshot of SARS-CoV-2 genomes from Canada. [ADD HERE A CAVEAT IF WE INCLUDE THE PROVINCIAL HEATMAP AND IF IT’S WELLMIXED OR NOT]
### metadata and trees
# Load the subsampled metadata and the two trees (divergence + time-scaled)
metasub1 <- read.delim( "./data_needed/subsampling_metadata_V1.tsv")
MLtree_sub1 <- read.tree("./data_needed/iqtree_V1_trimmed.nwk")   # ML (divergence) tree
ttree_sub1 <- read.tree("./data_needed/time_tree_V1_trimmed.nwk") # time tree
### needed for plotting
# lookup table mapping each Pango lineage to its pango.group label
pango_frame <- metaCANall[c("Pango_lineage","pango.group")]
pango_col <- pango_frame %>% distinct()
metasub1 <- merge(metasub1, pango_col, by.x = "Variant", by.y = "Pango_lineage",
                  all.y = FALSE, all.x = TRUE)
# attach province to the subsample via the sequence/virus name
division_frame <- metaCANall[c("Virus_name","province")]
metasub1 <- merge(metasub1, division_frame, by.x = "GISAID", by.y = "Virus_name",
                  all.y = FALSE, all.x = TRUE)
listpangogp <- sort(unique(metasub1$pango.group))
#division colours for heatmap
df_PTs <- data.frame(divisions = metasub1$province) #make df for heatmaps
rownames(df_PTs) <- metasub1$EPI #needs row names for heatmaps
#division colours
listPTs <- unique(metasub1$province)
listPTs <- listPTs[order(listPTs)]
# Drop the Wuhan/Hubei reference entry by NAME rather than by a hard-coded
# position (the original removed element 11, which silently breaks if the
# set of divisions ever changes).
listPTs <- listPTs[!grepl("hubei", listPTs, ignore.case = TRUE)]
getpal2 <- colorRampPalette(brewer.pal(10, "Spectral")) #"Set3
pal2 <- getpal2(length(listPTs))
names(pal2) <- listPTs
### time tree
# most recent sampling date anchors the time axis (mrsd) of the ggtree plot
mrdate <- max(na.omit(metasub1$Date))
# %<+% attaches tip metadata (keyed on EPI) so pango.group can colour tips;
# the black layer under the coloured layer gives each tip point an outline
p1 <- ggtree(ttree_sub1, mrsd = as.Date(mrdate)) %<+% metasub1[,c("EPI", "pango.group"), drop=FALSE] +
geom_tippoint(color="black", size=1)+
geom_tippoint(aes(color = pango.group), size=1)+ #, shape= lab
scale_color_manual(breaks=listpangogp,values=pal)+
#ggtitle("Time tree - Canada - downsample1")+
coord_cartesian(clip = 'off') + theme_tree2(plot.margin=margin(6, 40, 6, 6))+
guides(color = guide_legend(override.aes = list(size = 4) ) )+
theme(legend.position = "right", legend.title = element_blank(), legend.text = element_text(size =12))
plot1<-p1#don't plot provinces
# province heatmap strip disabled for now; re-enable to show division per tip
#plot1 <- gheatmap(p1, df_PTs[, "divisions", drop = FALSE], width= 0.1, color = F, colnames = F, legend_title = F) +
# scale_fill_manual(breaks=listPTs, values = pal2)
### diversity ML tree
# same layering as the time tree, but on the ML (divergence) tree
p2 <- ggtree(MLtree_sub1) %<+% metasub1[,c("EPI", "pango.group"), drop=FALSE] +
geom_tippoint(color="black", size=1)+
geom_tippoint(aes(color = pango.group), size=1)+
scale_color_manual(breaks=listpangogp, values = pal)+
#ggtitle("ML tree - Canada - subsample1, n = 8K")+
coord_cartesian(clip = 'off') + theme_tree2(plot.margin=margin(6, 40, 6, 6))+
guides(color = guide_legend(override.aes = list(size = 4) ) )+
theme(legend.position = "right", legend.title = element_blank(), legend.text = element_text(size =12))
plot2<-p2
# province heatmap strip disabled for now; re-enable to show division per tip
#plot2 <- gheatmap(p2, df_PTs[, "divisions", drop = FALSE], width= 0.1, color = F, colnames = F, legend_title = F) +
# scale_fill_manual(breaks=listPTs, values = pal2)
# Tally of subsampled genomes per variant group (output echoed below)
table(metasub1$pango.group)
##
## A.23.1 Alpha B.1.438.1 Beta Delta
## 26 193 68 65 1912
## Delta AY.25 Delta AY.27 Gamma Lambda Mu
## 139 63 136 13 13
## Omicron BA.1 Omicron BA.1.1 Omicron BA.2 other
## 373 347 78 2224
Compilation of analyses of omicron and its sublineages in Canada relative to global compositions.
# code developed by Raphael Poujol and Susanne Kraemer
# Plots a mutational-profile histogram comparing Omicron sublineages in
# Canada vs worldwide, using the project-local raphgraph helper library.
import matplotlib.pyplot as plt
import numpy as np
from data_needed.raphgraph.libs.Functions_For_MutationalGraphs import *
# percent of alt alleles to add mutation label
# File containing label
# Default is AminoAcid labels for non synonymous mutations
def_min_val_label(15)
load_mut_names("data_needed/raphgraph/libs/Mut_Nuc_AA_ORF.dic")
# input .var files share this path prefix; one file per lineage/region pair
p="data_needed/raphgraph/msa_0220_"
namelist=["Canada.BA.1","final.BA.1","Canada.BA.1.1","final.BA.1.1","Canada.BA.2","final.BA.2","final.BA.3"]
pathlist=[p+i+".var" for i in namelist]
# NOTE(review): the names above contain "." not "_", so this replace looks
# like a no-op for the current list — confirm whether "." was intended
namelist=[i.replace("_","\n") for i in namelist]
tablelist=openfiles(pathlist)
# keep positions mutated in >=75% of samples
poslist=getpositions(tablelist,percentmin=75,addmissing=False)
# drop genome ends (positions <=50 and >=29950), which are poorly resolved
poslist=[i for i in poslist if i>50 and i<29950]
bighist(tablelist,poslist,namelist,mytitle="Omicron mutations: Canada and worldwide")
plt.show()
Mutational profile (point mutations>75%) of Omicron and its sublineages in Canada and global (based on all genomes available on GISAID on the [ADD CORRECT DATE]).
There are various methods to investigate changes in evolutionary rates of VOC/VOIs and to compare their relative fitness in an epidemiological context. Here we present two of such methods.
Of particular interest when there is any newly arriving or emerging lineage is whether it has a selective advantage (and by how much) relative to the lineages already circulating in the population. Given the current national dominance of Omicron, the estimator is applied to the sublineages of Omicron. [ADD HERE HOW TO READ THE VALUES, AND MAYBE A PREVIOUS VOC AT EMERGENCE TO COMPARE/BASELINE]
Caveat Selection coefficients are not estimated for Alberta, which is currently taking a variant-specific sequencing strategy, based on an initial PCR screen, which would skew estimates calculated here. Canada wide estimates, thus, do not include this province.
# Focal Omicron sublineages.  Each may be a vector of lineage names, in
# which case only the first element is used for plot labels.
name1 <- "BA.1"
name2 <- "BA.1.1"
name3 <- "BA.2"
# Starting date for the fit.  It should not be much earlier than the point
# where both alleles are common, or rare migration events that die off can
# bias the parameter estimates (they would account for alleles present long
# in the past).
startdate <- as.Date("2021-12-15") # a later date means less sampling noise
# Fit a three-type selection model to daily Omicron sublineage counts for one
# jurisdiction and draw (1) fitted frequency curves with 95% CI bands and
# (2) a log-scale "breakpoint" plot of the same fits.
#   prov       - province name, or the special labels
#                "East provinces (NL+NS+NB+ON+QC)" / "Canada (no AB)"
#   startdate  - first Collection_date used in the fit
#   name1..3   - lineage names (or vectors of names; first element = label)
# Uses globals: metaCANall, pal.  mle2 comes from the bbmle package;
# RandomFromHessianOrMCMC presumably from HelpersMG — confirm the library()
# calls elsewhere in the file.  Side effects: two base-graphics plots.
plot_selection_estimator <- function(prov,startdate,name1,name2,name3) {
# daily counts per BA.* lineage for the requested jurisdiction
# NOTE(review): grepl("BA.", ...) leaves the dot as a regex wildcard — works
# for the lineages present, but "BA\\." would be stricter.
mydata=metaCANall %>% filter(grepl("BA.", Pango_lineage), province == prov, !is.na(Collection_date), Collection_date >= startdate) %>% group_by(Collection_date) %>% count(Pango_lineage)
if(prov=="East provinces (NL+NS+NB+ON+QC)"){
mydata= metaCANall %>% filter(grepl("BA.", Pango_lineage), province %in% list("Nova_Scotia","New_Brunswick","Newfoundland_and_Labrador","Quebec","Ontario"), !is.na(Collection_date), Collection_date >= startdate) %>% group_by(Collection_date) %>% count(Pango_lineage)
}
if(prov=="Canada (no AB)"){
mydata=metaCANall %>% filter(grepl("BA.", Pango_lineage), province != "Alberta", !is.na(Collection_date), Collection_date >= startdate) %>% group_by(Collection_date) %>% count(Pango_lineage)
}
#Set the final date:
lastdate<-max(mydata$Collection_date)
#convert time to an integer counter for use in fitting, first using the last date as time 0:
mydata$time = as.numeric(difftime(mydata$Collection_date, lastdate, units = "days"))
#filter data to after that starting date
data1 <- filter(mydata, Pango_lineage %in% name1)
data2 <- filter(mydata, Pango_lineage %in% name2)
data3 <- filter(mydata, Pango_lineage %in% name3)
#allow multiple Pango lineages to be combined if name1 or name2 includes a list, summing n
data1 <- as.data.frame(unique(data1 %>% group_by(time) %>% transmute(day=Collection_date, n=sum(n), time=time)))
data2 <- as.data.frame(unique(data2 %>% group_by(time) %>% transmute(day=Collection_date, n=sum(n), time=time)))
data3 <- as.data.frame(unique(data3 %>% group_by(time) %>% transmute(day=Collection_date, n=sum(n), time=time)))
# from here on only the first name of each list is used (as the plot label)
name1 <- name1[[1]]
name2 <- name2[[1]]
name3 <- name3[[1]]
data1$Pango_lineage <- name1
data2$Pango_lineage <- name2
data3$Pango_lineage <- name3
#join lists in a dataframe to plot proportions and represent time as a list of integers
timestart<-as.numeric(difftime(startdate, lastdate, units = "days"))
timeend<-as.numeric(difftime(lastdate, lastdate, units = "days"))
toplot <- data.frame(time = seq.int(timestart,timeend))
# align the three daily-count series onto the common time grid
toplot$n1 <- data1$n[match(toplot$time,data1$time)]
toplot$n2 <- data2$n[match(toplot$time,data2$time)]
toplot$n3 <- data3$n[match(toplot$time,data3$time)]
toplot[is.na(toplot)] = 0 #Any NA's refer to no variant of that type on a day, set to zero
#To aid in the ML search, we rescale time to be centered as close as possible
#to the midpoint for the second variable (p=0.5), to make sure that the alleles
#are segregating at the reference date.
#If we set t=0 when p is near 0 or 1, then the likelihood surface is very flat.
# the product is maximal when all three lineages are well represented
v=toplot$n1*toplot$n2*toplot$n3
refdate<-which(v==max(v,na.rm=TRUE))
refdate<-refdate[[1]] #Just in case there is more than one matching point, the first is taken
# shift the time axis so t=0 falls on that reference day
timeend <- (timeend-timestart)-refdate
timestart <- -refdate
toplot$time <- seq.int(timestart,timeend)
data1$time <- data1$time + (timeend-timestart)-refdate
data2$time <- data2$time + (timeend-timestart)-refdate
data3$time <- data3$time + (timeend-timestart)-refdate
#date converter
dateseq <- seq.Date(startdate,lastdate,"days")
dateconverter <- data.frame(time=toplot$time,date=dateseq)
# plot(y=toplot$n2/(toplot$n1+toplot$n2+toplot$n3),x=toplot$time,xlab="Time",ylab="proportion",ylim=c(0,1),col=col2)
# points(y=toplot$n3/(toplot$n1+toplot$n2+toplot$n3),x=toplot$time,xlab="Time",ylab="proportion",cex=0.5,col=col3)
#With time started in this way, we can use 0.5 as the frequency at t=0 (startp):
startp <- 0.5
#To get an estimate for the initial p value to try, we average the last 10 points before refdate
#tempforp <- which(0 == data2$time)
#startp <- mean(toplot$n2[(tempforp-9):tempforp])/(mean(toplot$n1[(tempforp-9):tempforp])+mean(toplot$n2[(tempforp-9):tempforp]))
##############################
# Likelihood with two types
##############################
################################
# Using mle2 and profile in BBMLE
################################
#Alternatively, it looks like the BBMLE package performs well and gives
#confidence intervals for the parameters. Here, we have to flip the sign
#of the log-likelihood directly for use with mle2 (can't send control=list(fnscale=-1) through?).
# Negative log-likelihood of the multinomial logistic-growth model:
# p2, p3 = frequencies of types 2 and 3 at t=0; s2, s3 = selection
# coefficients relative to type 1 (whose frequency is 1-p2-p3 at t=0).
trifunc <- function(p2,p3,s2,s3){
-(sum(data1$n*log((1-p2-p3)/((1-p2-p3)+p2*exp(s2*data1$time)+p3*exp(s3*data1$time))))+
sum(data2$n*log(p2*exp(s2*data2$time)/((1-p2-p3)+p2*exp(s2*data2$time)+p3*exp(s3*data2$time))))+
sum(data3$n*log(p3*exp(s3*data3$time)/((1-p2-p3)+p2*exp(s2*data3$time)+p3*exp(s3*data3$time)))))
}
startpar<-list(p2=startp, p3=0.05, s2=0.1, s3=0.1)
bbml<-mle2(trifunc, start = startpar)
bbml
#lnL
bbml.value<--bbml@min
#These confidence intervals are similar (I PREFER uniroot based on the profile likelihood procedure)
#confint(bbml) # based on inverting a spline fit to the profile
#confint(bbml,method="quad") # based on the quadratic approximation at the maximum likelihood estimate
myconf<-confint(bbml,method="uniroot") # based on root-finding to find the exact point where the profile crosses the critical level
myconf
#Interesting way of profiling the likelihood
#bbprofile<-profile(bbml)
#plot(bbprofile)
#proffun(bbml)
################################
# Drawing random parameters for CI
################################
#We can draw random values for the parameters from the Hessian to determine the
#variation in {p,s} combinations consistent with the data using RandomFromHessianOrMCMC.
#We can also generate confidence intervals accounting for uncertainty in all parameters
#by drawing from the covariance matrix estimated from the Hessian (the matrix of double derivatives
#describing the curvature of the likelihood surface near the ML peak).
bbfit<-c(p2=bbml@details[["par"]][["p2"]],p3=bbml@details[["par"]][["p3"]],
s2=bbml@details[["par"]][["s2"]],s3=bbml@details[["par"]][["s3"]])
bbfit
bbhessian<-bbml@details[["hessian"]]
colnames(bbhessian) <- c("p2","p3","s2","s3")
rownames(bbhessian) <- c("p2","p3","s2","s3")
bbhessian
df <- RandomFromHessianOrMCMC(Hessian=(bbhessian),
fitted.parameters=bbfit,
method="Hessian",replicates=1000)$random
#Once we get the set of {p,s} values, we can run them through the s-shaped curve of selection
# frequency-over-time curve for type 2 given one {p,s} draw
scurve1 <- function(p2,p3,s2,s3){
(p2*exp(s2*toplot$time)/((1-p2-p3)+p2*exp(s2*toplot$time)+p3*exp(s3*toplot$time)))
}
# frequency-over-time curve for type 3 given one {p,s} draw
scurve2 <- function(p2,p3,s2,s3){
(p3*exp(s3*toplot$time)/((1-p2-p3)+p2*exp(s2*toplot$time)+p3*exp(s3*toplot$time)))
}
#Generating a list of frequencies at each time point given each {p,s} combination
#NOTE - We could run more time points, if projections into the future were desired just by
#extending toplot$time
setofcurves2 <- t(mapply(scurve1,df[,1],df[,2],df[,3],df[,4]))
setofcurves3 <- t(mapply(scurve2,df[,1],df[,2],df[,3],df[,4]))
#95% innerquantiles
lowercurve1 <- c()
uppercurve1 <- c()
lowercurve2 <- c()
uppercurve2 <- c()
for (tt in 1:length(toplot$time)) {
lower1<-quantile(setofcurves2[,tt],0.025)
upper1<-quantile(setofcurves2[,tt],0.975)
lowercurve1<-append(lowercurve1,lower1)
uppercurve1<-append(uppercurve1,upper1)
lower2<-quantile(setofcurves3[,tt],0.025)
upper2<-quantile(setofcurves3[,tt],0.975)
lowercurve2<-append(lowercurve2,lower2)
uppercurve2<-append(uppercurve2,upper2)
}
#add date column
toplot$date <- dateconverter$date
#A graph with 95% quantiles based on the Hessian draws.
# palette colours for the two focal sublineages
col2=pal[paste0("Omicron ",name2)]
col3=pal[paste0("Omicron ",name3)]
#png(file=paste0("curves_",i,".png"))
plot(y=uppercurve1,x=toplot$date,type="l",xlab="",ylab=paste0("proportion in ",prov),ylim=c(0,1))
# observed daily proportions; point area scales with the daily sample size
points(y=toplot$n2/(toplot$n1+toplot$n2+toplot$n3),x=toplot$date,pch=21, col = "black", bg = alpha(col2, 0.7), cex=sqrt(toplot$n2)/3)#cex=(toplot$n2/log(10))/20)#cex=toplot$n2/50 )#cex = 0.5)
points(y=toplot$n3/(toplot$n1+toplot$n2+toplot$n3),x=toplot$date,pch=21, col = "black", bg = alpha(col3, 0.7), cex=sqrt(toplot$n3)/3)#, cex=(toplot$n3/log(10))/20) #cex=toplot$n3/50 )#cex = 0.5)
# shaded 95% CI ribbons from the Hessian draws
polygon(c(toplot$date, rev(toplot$date)), c(lowercurve1, rev(uppercurve1)),col = alpha(col2, 0.5))
polygon(c(toplot$date, rev(toplot$date)), c(lowercurve2, rev(uppercurve2)),col = alpha(col3, 0.5))
# maximum-likelihood fitted curves for the two sublineages
lines(y=(bbfit[["p2"]]*exp(bbfit[["s2"]]*toplot$time)/((1-bbfit[["p2"]]-bbfit[["p3"]])+bbfit[["p2"]]*exp(bbfit[["s2"]]*toplot$time)+bbfit[["p3"]]*exp(bbfit[["s3"]]*toplot$time))),
x=toplot$date,type="l")
lines(y=(bbfit[["p3"]]*exp(bbfit[["s3"]]*toplot$time)/((1-bbfit[["p2"]]-bbfit[["p3"]])+bbfit[["p2"]]*exp(bbfit[["s2"]]*toplot$time)+bbfit[["p3"]]*exp(bbfit[["s3"]]*toplot$time))),
x=toplot$date,type="l")
# annotation: point estimate {lower, upper CI} of each selection coefficient
str2=sprintf("%s: %s {%s, %s}",name2,format(round(bbfit[["s2"]],3),nsmall=3),format(round(myconf[3],3),nsmall=3),format(round(myconf[7],3),nsmall=3))
str3=sprintf("%s: %s {%s, %s}",name3,format(round(bbfit[["s3"]],3),nsmall=3),format(round(myconf[4],3),nsmall=3),format(round(myconf[8],3),nsmall=3))
text(x=toplot$date[1],y=0.95,str2,col = col2,pos=4, cex = 1)
text(x=toplot$date[1],y=0.87,str3,col = col3,pos=4, cex = 1)
#dev.off()
################################
# Looking for a breakpoint
################################
# Visually looking for breaks in the slope over time (formal breakpoint search
# is in the two variant code).
#png(file=paste0("logit_",i, ".png"))
options( scipen = 5 )
# same data and fits on a log y-axis: straight lines = constant selection
plot(y=toplot$n2/(toplot$n1+toplot$n2+toplot$n3),x=toplot$date, pch=21, col = "black", bg = alpha(col2, 0.7), cex=sqrt(toplot$n2)/3,
log="y",ylim=c(0.001,1000), yaxt = "n", xlab="",ylab=paste0("logit in ",prov))
points(y=toplot$n3/(toplot$n1+toplot$n2+toplot$n3),x=toplot$date, pch=21, col = "black", bg = alpha(col3, 0.7), cex=sqrt(toplot$n3)/3)
lines(y=(bbfit[["p2"]]*exp(bbfit[["s2"]]*toplot$time)/((1-bbfit[["p2"]]-bbfit[["p3"]])+bbfit[["p2"]]*exp(bbfit[["s2"]]*toplot$time)+bbfit[["p3"]]*exp(bbfit[["s3"]]*toplot$time))),
x=toplot$date, type="l",col="black")#, col=col2)
lines(y=(bbfit[["p3"]]*exp(bbfit[["s3"]]*toplot$time)/((1-bbfit[["p2"]]-bbfit[["p3"]])+bbfit[["p2"]]*exp(bbfit[["s2"]]*toplot$time)+bbfit[["p3"]]*exp(bbfit[["s3"]]*toplot$time))),
x=toplot$date, type="l",col = "black")#, col=col3)
str2=sprintf("%s: %s {%s, %s}",name2,format(round(bbfit[["s2"]],3),nsmall=3),format(round(myconf[3],3),nsmall=3),format(round(myconf[7],3),nsmall=3))
str3=sprintf("%s: %s {%s, %s}",name3,format(round(bbfit[["s3"]],3),nsmall=3),format(round(myconf[4],3),nsmall=3),format(round(myconf[8],3),nsmall=3))
text(x=toplot$date[1],y=500,str2,col = col2,pos=4, cex = 1)
text(x=toplot$date[1],y=200,str3,col = col3,pos=4, cex = 1)
axis(2, at=c(0.001,0.01,0.1,1,10,100,1000), labels=c(0.001,0.01,0.1,1,10,100,1000))
#dev.off()
#Bends suggest a changing selection over time (e.g., due to the impact of vaccinations
#differentially impacting the variants). Sharper turns are more often due to NPI measures.
}
#####################################################
# tabs for displaying in notebook
#each PT tab should have curve plot and breakpoint plot side by side
## Estimation using variance-covariance matrix
## Still 1000 replicates
## Estimation using variance-covariance matrix
## Still 1000 replicates
## Estimation using variance-covariance matrix
## Still 1000 replicates
## Estimation using variance-covariance matrix
## Still 1000 replicates
Root-to-tip plots give estimates of substitution rates. Clusters with stronger positive slopes than the average SARS-CoV-2 dataset are an indication that they are accumulating mutations at a faster pace, and clusters that jump up above the average could indicate a mutational jump (similar to what we saw with Alpha when it first appeared in the UK).
#RTT code contributed by Art Poon
# Root-to-tip (RTT) setup: align tree tips with their metadata rows.
# `rooted` is the rooted ML tree; tips are matched to metadata by EPI id.
rooted <- MLtree_sub1
metadataRTT <- metasub1
vocs <- names(pal)                      # variant groups tracked in the palette
name_list <- rooted$tip.label
index1 <- match(name_list, metadataRTT$EPI)   # metadata row for each tip (NA if unmatched)
date <- as.Date(metadataRTT$Date[index1])     # collection date, tip order
pg <- metadataRTT$pango.group[index1]         # pango group label, tip order
# total branch length from root to each tip (divergence);
# seq_len() is safe even for a degenerate tree, unlike 1:Ntip()
div <- node.depth.edgelength(rooted)[seq_len(Ntip(rooted))]
blobs <- function(x, y, col, cex = 1) {
  # Filled points with a black outline: draw the outline ring first,
  # then overlay the fill using a fully transparent border colour so
  # semi-transparent fills do not darken the outline.
  points(x, y, pch = 21, cex = cex)
  points(x, y, pch = 21, cex = cex, col = rgb(0, 0, 0, 0), bg = col)
}
dlines <- function(x, y, col) {
  # Coloured trend line drawn over a thicker black underlay for contrast.
  lines(x, y, lwd = 2.5)
  lines(x, y, col = col)
}
# Root-to-tip regression plot: divergence from root vs. collection date,
# with a robust linear fit per variant group (baseline = "other").
par(mar = c(5, 5, 0, 1))
plot(date, div, type = 'n', las = 1, cex.axis = 0.6, cex.lab = 0.7, bty = 'n',
     xaxt = 'n', xlab = "Sample collection date", ylab = "Divergence from root")
# Month-aligned x axis with 5 ticks; na.rm guards against tips whose
# metadata match failed and left an NA date
xx <- floor_date(seq(min(date, na.rm = TRUE), max(date, na.rm = TRUE),
                     length.out = 5), unit = "months")
axis(side = 1, at = xx, label = format(xx, "%b %Y"), cex.axis = 0.6)
# Background ("other") genomes give the baseline molecular clock
blobs(date[pg == 'other'], div[pg == 'other'], col = 'grey', cex = 1)
fit0 <- rlm(div[pg == 'other'] ~ date[pg == 'other'])
abline(fit0, col = 'gray50')
fits <- list(other = fit0)
for (i in seq_along(vocs)) {       # seq_along(): no spurious iteration when vocs is empty
  variant <- vocs[i]
  x <- date[pg == variant]
  if (all(is.na(x))) next          # variant absent from this subsample
  y <- div[pg == variant]
  blobs(x, y, col = pal[i], cex = 0.8)
  fit <- rlm(y ~ x)                # robust fit, resistant to divergence outliers
  # sort by date so the fitted line is drawn left-to-right, not in tip order
  o <- order(fit$x[, 2])
  dlines(fit$x[o, 2], predict(fit)[o], col = pal[i])
  fits[[variant]] <- fit
}
legend(x = min(date, na.rm = TRUE), y = max(div), legend = vocs, pch = 21,
       pt.bg = pal, bty = 'n', cex = 0.8)
# Summarise molecular clock rates per group: regression slope (subs/site/day)
# scaled by genome length gives subs/genome/day, with Wald 95% CIs.
genome_length <- 29970   # nt; scaling factor from per-site to per-genome rates
ci <- lapply(fits, confint.default)
kable(data.frame(
  # vapply() pins the return type, unlike sapply()
  n = vapply(fits, function(f) nrow(f$x), numeric(1)),
  est = genome_length * vapply(fits, function(f) f$coef[2], numeric(1)),
  lower.95 = genome_length * vapply(ci, function(m) m[2, 1], numeric(1)),
  upper.95 = genome_length * vapply(ci, function(m) m[2, 2], numeric(1))
),
col.names = c("Number of genomes", "Estimate", "Lower 95% CI", "Upper 95% CI"),
format = "html", align = "rrrr", caption = "Molecular clock rates (subs/genome/day)",
format.args = list(scientific = FALSE), digits = 4, table.attr = "style='width:100%;'")
| | Number of genomes | Estimate | Lower 95% CI | Upper 95% CI |
|---|---|---|---|---|
| other | 2202 | 0.0533 | 0.0519 | 0.0547 |
| Alpha | 187 | 0.0734 | 0.0679 | 0.0789 |
| Beta | 65 | 0.0265 | 0.0173 | 0.0356 |
| Gamma | 136 | 0.0660 | 0.0552 | 0.0768 |
| Delta | 1909 | 0.0539 | 0.0507 | 0.0571 |
| Delta AY.25 | 139 | 0.0405 | 0.0347 | 0.0463 |
| Delta AY.27 | 63 | 0.0434 | 0.0360 | 0.0508 |
| Lambda | 13 | 0.0241 | 0.0026 | 0.0457 |
| Omicron BA.1 | 373 | 0.0389 | 0.0287 | 0.0491 |
| Omicron BA.1.1 | 346 | 0.0288 | 0.0204 | 0.0372 |
| Omicron BA.2 | 78 | 0.0372 | 0.0110 | 0.0634 |
| Mu | 13 | 0.0251 | -0.0230 | 0.0732 |
| A.23.1 | 26 | 0.0639 | 0.0382 | 0.0896 |
| B.1.438.1 | 67 | 0.0413 | 0.0333 | 0.0493 |
The evolutionary rate among genomes sampled in Canada is 0.0913624 subs/genome/day (grey line). For comparison, the global average is 0.0674941 subs/genome/day [ADD REF - nextstrain estimate?].
We are in the process of adding or would like to develop code for some of the following analyses:
* dN/dS (by variant and by gene/domains)
* Tajima’s D over time
* clustering analyses
* genomically inferred epidemiological parameters: R0, serial interval, etc.
With anonymized data on vaccination status, severity/outcome, reason for sequencing (e.g. outbreak, hospitalization, or general sampling), and setting (workplace, school, daycare, LTC, health institution, other), we could analyze genomic characteristics of the virus relative to the epidemiological and immunological conditions in which it is spreading and evolving. Studies on mutational correlations to superspreading events, vaccination status, or comparisons between variants would allow us to better understand transmission and evolution in these environments.
Canadian genomes were obtained from the GISAID MSA on [ADD CORRECT DATE HERE] and down-sampled to one genome per lineage, province and month + one genome of each omicron sublineage per province and month (n ~ 5500 genomes). The ML tree was reconstructed based on the GISAID alignment of these genomes using IQ-TREE. Outliers were identified in TempEst and removed from the dataset. TreeTime was used to estimate a time tree [ADD HERE ASSUMPTIONS, E.G. STRICT OR RELAXED CLOCK?], and trees were plotted with ggtree.
[SALLY ADDS HERE METHODS HERE]
Maximum likelihood tree (IQ-TREE) processed with root-to-tip regression and plotting in R.
[EXPLAIN HOW RAPH-GRAPH IS GENERATED]
Collect a list of bioinformatics, phylogenetic, and modelling tools that are useful for SARS-CoV-2 analyses:
We thank all the authors, developers, and contributors to the GISAID and VirusSeq database for making their SARS-CoV-2 sequences publicly available. We especially thank the Canadian Public Health Laboratory Network, academic sequencing partners, diagnostic hospital labs, and other sequencing partners for the provision of the Canadian sequence data used in this work. Genome sequencing in Canada was supported by a Genome Canada grant to the Canadian COVID-19 Genomic Network (CanCOGeN).
We gratefully acknowledge all the Authors, the Originating laboratories responsible for obtaining the specimens, and the Submitting laboratories for generating the genetic sequence and metadata and sharing via the GISAID Initiative, on which this research is based.
GISAID (https://www.gisaid.org/) We would like to thank the GISAID Initiative and are grateful to all of the data contributors, i.e. the Authors, the Originating laboratories responsible for obtaining the specimens, and the Submitting laboratories for generating the genetic sequence and metadata and sharing via the GISAID Initiative, on which this research is based. Elbe, S., and Buckland-Merrett, G. (2017) Data, disease and diplomacy: GISAID’s innovative contribution to global health. Global Challenges, 1:33-46. DOI:10.1002/gch2.1018, PMCID:31565258 Shu, Y., McCauley, J. (2017) GISAID: From vision to reality. EuroSurveillance, 22(13), DOI: 10.2807/1560-7917.ES.2017.22.13.30494, PMCID: PMC5388101
The Canadian VirusSeq Data Portal (https://virusseq-dataportal.ca) We wish to acknowledge the following organisations/laboratories for contributing data to the Portal: Canadian Public Health Laboratory Network (CPHLN), CanCOGeN VirusSeq, Saskatchewan - Roy Romanow Provincial Laboratory (RRPL), Nova Scotia Health Authority, Alberta ProvLab North (APLN), Queen’s University / Kingston Health Sciences Centre, National Microbiology Laboratory (NML), Institut National de Sante Publique du Quebec (INSPQ), BCCDC Public Health Laboratory, Public Health Ontario (PHO), Newfoundland and Labrador - Eastern Health, Unity Health Toronto, Ontario Institute for Cancer Research (OICR), Provincial Public Health Laboratory Network of Nova Scotia, Centre Hospitalier Universitaire Georges L. Dumont - New Brunswick, and Manitoba Cadham Provincial Laboratory. Please see the complete list of laboratories included in this repository.
Public Health Agency of Canada (PHAC) / National Microbiology Laboratory (NML) - (https://health-infobase.canada.ca/covid-19/epidemiological-summary-covid-19-cases.html)
various provincial public health websites (e.g. INSPQ https://www.inspq.qc.ca/covid-19/donnees/)
The version numbers of all packages in the current environment as well as information about the R install is reported below.
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] HelpersMG_5.1 coda_0.19-4 lme4_1.1-28 Matrix_1.4-0
## [5] bbmle_1.0.24 MASS_7.3-55 viridis_0.6.2 viridisLite_0.4.0
## [9] colorspace_2.0-2 RColorBrewer_1.1-2 scales_1.1.1 kableExtra_1.3.4
## [13] gridExtra_2.3 ggbeeswarm_0.6.0 DT_0.20 cowplot_1.1.1
## [17] ggtree_3.2.1 phytools_0.7-90 maps_3.4.0 phangorn_2.8.0
## [21] tidytree_0.3.6 phylotools_0.2.2 ape_5.5 treeio_1.18.1
## [25] lubridate_1.8.0 reticulate_1.22 knitr_1.37 forcats_0.5.1
## [29] stringr_1.4.0 dplyr_1.0.8 purrr_0.3.4 readr_2.1.2
## [33] tidyr_1.2.0 tibble_3.1.6 ggplot2_3.3.5 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] minqa_1.2.4 ellipsis_0.3.2 fs_1.5.2
## [4] aplot_0.1.1 rstudioapi_0.13 farver_2.1.0
## [7] mvtnorm_1.1-3 fansi_1.0.2 xml2_1.3.3
## [10] splines_4.1.2 codetools_0.2-18 mnormt_2.0.2
## [13] jsonlite_1.8.0 nloptr_2.0.0 broom_0.7.12
## [16] dbplyr_2.1.1 png_0.1-7 compiler_4.1.2
## [19] httr_1.4.2 backports_1.4.1 assertthat_0.2.1
## [22] fastmap_1.1.0 lazyeval_0.2.2 cli_3.2.0
## [25] htmltools_0.5.2 tools_4.1.2 igraph_1.2.9
## [28] gtable_0.3.0 glue_1.6.2 clusterGeneration_1.3.7
## [31] rappdirs_0.3.3 fastmatch_1.1-3 Rcpp_1.0.8
## [34] cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.3.8
## [37] svglite_2.1.0 nlme_3.1-153 xfun_0.29
## [40] rvest_1.0.2 lifecycle_1.0.1 hms_1.1.1
## [43] parallel_4.1.2 expm_0.999-6 yaml_2.3.5
## [46] ggfun_0.0.4 yulab.utils_0.0.4 sass_0.4.0
## [49] bdsmatrix_1.3-4 stringi_1.7.6 highr_0.9
## [52] plotrix_3.8-2 boot_1.3-28 rlang_1.0.1
## [55] pkgconfig_2.0.3 systemfonts_1.0.4 evaluate_0.15
## [58] lattice_0.20-45 labeling_0.4.2 patchwork_1.1.1
## [61] htmlwidgets_1.5.4 tidyselect_1.1.2 magrittr_2.0.2
## [64] R6_2.5.1 generics_0.1.2 combinat_0.0-8
## [67] DBI_1.1.2 pillar_1.7.0 haven_2.4.3
## [70] withr_2.4.3 scatterplot3d_0.3-41 modelr_0.1.8
## [73] crayon_1.5.0 utf8_1.2.2 tmvnsim_1.0-2
## [76] tzdb_0.2.0 rmarkdown_2.11 grid_4.1.2
## [79] readxl_1.3.1 webshot_0.5.2 reprex_2.0.1
## [82] digest_0.6.29 numDeriv_2016.8-1.1 gridGraphics_0.5-1
## [85] munsell_0.5.0 beeswarm_0.4.0 ggplotify_0.1.0
## [88] vipor_0.4.5 bslib_0.3.1 quadprog_1.5-8